import pandas as pd
import re
from pandas.core.frame import DataFrame
import numpy as np
import csv
import re
import urllib
import pandas as pd
import requests
import time
import os
from multiprocessing import Pool


# Collect the URLs of all pages of the thread.
def get_urls(url):
    """Return one URL per page of the thread located at ``url``.

    The thread page is downloaded, the highest page number is read from the
    "尾页" (last page) link, and a ``pn=<page>`` query parameter is appended
    to ``url`` for every page from 1 up to that number.

    Args:
        url: Thread URL. It may be a bare URL, already end in ``?`` or ``&``,
            or already carry other query parameters.

    Returns:
        list[str]: Page URLs in ascending page order. When the downloaded
        HTML contains no "尾页" link, a single URL for page 1 is returned.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Local import: the file only has `import urllib`, which does not by
    # itself guarantee that the `urllib.parse` submodule is loaded.
    from urllib.parse import urlencode

    headers = {"user-agent": "Mozilla/5.0"}
    # A timeout keeps a stalled connection from blocking forever.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.content.decode("utf8")

    # Raw string so that \d and \? reach the regex engine unchanged.
    last_page = re.findall(r'<a href="/p/\d+\?pn=(\d+)">尾页</a>', html)
    total_pages = int(last_page[-1]) if last_page else 1

    # Choose the separator that keeps the resulting URL well-formed.
    if url.endswith(("?", "&")):
        separator = ""
    elif "?" in url:
        separator = "&"
    else:
        separator = "?"

    # Convert the {"pn": page} mapping into query-string form for each page.
    return [
        url + separator + urlencode({"pn": page})
        for page in range(1, total_pages + 1)
    ]


def remove_img(s):
    """Return ``s`` with every ``<img ...>`` tag stripped out.

    Tags are matched non-greedily up to the first ``>``. Because ``.`` does
    not match a newline here, a tag that spans several lines is left as is.
    """
    img_tag = re.compile('<img.*?>')
    return img_tag.sub('', s)


def parser(url):
    """Download one thread page and append its posts to ``results.csv``.

    The page is requested with extra ``ajax=1`` and ``t=<ms timestamp>``
    query parameters. Three regular expressions pull out user names, post
    contents and reply timestamps; one CSV row
    ``[user, content, reply_time]`` is appended per extracted post.

    Args:
        url: URL of a single thread page (typically ending in ``?pn=<n>``).

    Returns:
        None. Rows are appended to ``results.csv`` in the current working
        directory and a confirmation message is printed.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    timestamp_ms = int(time.time() * 1000)
    # Use "?" when the URL has no query string yet so the result stays valid.
    separator = "&" if "?" in url else "?"
    url = "{}{}ajax=1&t={}".format(url, separator, timestamp_ms)

    headers = {"user-agent": "Mozilla/5.0"}
    # A timeout keeps a stalled connection from blocking the worker forever.
    response = requests.get(url, headers=headers, timeout=10)
    html_str = response.content.decode("utf-8")

    re1 = 'target="_blank">(.*?)<'
    re2 = 'd_post_content j_d_post_content  clearfix" style="display:;">(.*?)<'
    # NOTE(review): this pattern only matches values that start with "2021";
    # posts whose timestamp begins with another year are not captured.
    re3 = '&quot;(2021.*?)&quot;'

    user_list = [remove_img(user) for user in re.findall(re1, html_str, re.S)]
    content_list = re.findall(re2, html_str, re.S)
    reply_list = re.findall(re3, html_str, re.S)

    # zip() stops at the shortest list, so a page where the three patterns
    # match a different number of times no longer raises IndexError after
    # some rows have already been written.
    rows = [
        [user, content.strip(), replied_at]
        for user, content, replied_at in zip(user_list, content_list, reply_list)
    ]

    # Append the extracted rows to the CSV file.
    with open("results.csv", 'a', encoding='utf-8', newline="") as f:
        csv.writer(f).writerows(rows)
    print("成功获得贴吧信息！")


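# NOTE(review): the original comment here read "rewrite the results CSV file
# sorted by time", but the function below only reads the thread's page count.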
def get_page(url):
    """Return the page count of the thread at ``url``.

    The thread page is downloaded and the page number is read from the first
    "尾页" (last page) link found in the HTML.

    Args:
        url: Thread URL.

    Returns:
        int: The page number carried by the "尾页" link, or 1 when the
        downloaded HTML contains no such link.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # The file defines no `get_html` helper, so the page is fetched directly,
    # with the same user-agent header the other requests in this file use.
    headers = {"user-agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=10)
    content = response.content.decode("utf8")

    pattern = re.compile(r'<a href="/p/.*?pn=(.*?)">尾页</a>')
    matches = pattern.findall(content)
    # No "尾页" link: fall back to one page instead of raising IndexError.
    return int(matches[0]) if matches else 1


def main(url):
    """Scrape every page of the thread at ``url`` with a process pool.

    The page count comes from ``get_page``; one ``<url>?pn=<page>`` URL is
    built per page and each is handed to ``parser``, which appends the
    extracted posts to ``results.csv``.

    Args:
        url: Thread URL without a query string.

    Returns:
        None.
    """
    total_pages = get_page(url)
    # Pages are numbered from 1 (as in get_urls), so pn=0 is not requested.
    url_list = ["{}?pn={}".format(url, page) for page in range(1, total_pages + 1)]

    # Pool creates worker *processes*; the context manager releases them once
    # the blocking map() call has returned.
    # NOTE(review): the workers all append to the same results.csv, so row
    # order across pages is not guaranteed — confirm that is acceptable.
    with Pool(3) as pool:
        pool.map(parser, url_list)



if __name__ == "__main__":
    # Script entry point: scrape the hard-coded Tieba thread.
    thread_url = "https://tieba.baidu.com/p/7247002707"
    main(url=thread_url)